在上一篇我們把任務拆成了幾個小步驟。
以目前的程式來看,我們似乎完成了『抓取熱門看板』的功能。在進行下一個步驟前,我們先來 code review 目前的程式碼(先只看測試就好)。
<?php
namespace Recca0120\Ithome30\Tests;
use Mockery;
use GuzzleHttp\Client;
use GuzzleHttp\Psr7\Request;
use PHPUnit\Framework\TestCase;
use Recca0120\Ithome30\PttCrawler;
use Psr\Http\Client\ClientInterface;
/**
 * Checks that PttCrawler requests the PTT hot-boards page and parses the first board row.
 *
 * The crawler is given a Mockery spy wrapping a real Guzzle client, so the outgoing
 * request can be inspected afterwards. php-vcr is switched on with the
 * "ptt_home.yaml" cassette for the duration of each test.
 */
class PttCrawlerTest extends TestCase
{
    protected function setUp(): void
    {
        parent::setUp();
        \VCR\VCR::turnOn();
    }

    protected function tearDown(): void
    {
        // Cleanup lives here rather than at the end of the test method so that it
        // still runs when an assertion fails; otherwise VCR would stay switched on
        // with a cassette inserted and leak into the following tests.
        \VCR\VCR::eject();
        \VCR\VCR::turnOff();
        Mockery::close();
        parent::tearDown();
    }

    public function test_fetch_board_page(): void
    {
        \VCR\VCR::insertCassette('ptt_home.yaml');

        /** @var Mockery\Mock|ClientInterface $httpClient */
        $httpClient = Mockery::spy(new Client());
        $crawler = new PttCrawler($httpClient);

        $records = $crawler->all();

        self::assertEquals([
            'name' => 'Gossiping',
            'nuser' => '8803',
            'class' => '綜合',
            'title' => '[八卦] 亞運李智凱、許皓鋐奪金!',
        ], $records[0]);

        // Exactly one request must have been sent, and it must target the hot-boards page.
        $httpClient->shouldHaveReceived('sendRequest')
            ->once()
            ->with(Mockery::on(
                static fn (Request $request): bool =>
                    (string) $request->getUri() === 'https://www.ptt.cc/bbs/hotboards.html'
            ));
    }
}
是不是發現我們少抓了各個看板的 url 了?好在我們有測試保護,立刻補上程式碼:
<?php
namespace Recca0120\Ithome30\Tests;
use Mockery;
use GuzzleHttp\Client;
use GuzzleHttp\Psr7\Request;
use PHPUnit\Framework\TestCase;
use Recca0120\Ithome30\PttCrawler;
use Psr\Http\Client\ClientInterface;
/**
 * Checks that PttCrawler requests the PTT hot-boards page and parses the first board
 * row, including the absolute URL of the board.
 *
 * The crawler is given a Mockery spy wrapping a real Guzzle client, so the outgoing
 * request can be inspected afterwards. php-vcr is switched on with the
 * "ptt_home.yaml" cassette for the duration of each test.
 */
class PttCrawlerTest extends TestCase
{
    protected function setUp(): void
    {
        parent::setUp();
        \VCR\VCR::turnOn();
    }

    protected function tearDown(): void
    {
        // Cleanup lives here rather than at the end of the test method so that it
        // still runs when an assertion fails; otherwise VCR would stay switched on
        // with a cassette inserted and leak into the following tests.
        \VCR\VCR::eject();
        \VCR\VCR::turnOff();
        Mockery::close();
        parent::tearDown();
    }

    public function test_fetch_board_page(): void
    {
        \VCR\VCR::insertCassette('ptt_home.yaml');

        /** @var Mockery\Mock|ClientInterface $httpClient */
        $httpClient = Mockery::spy(new Client());
        $crawler = new PttCrawler($httpClient);

        $records = $crawler->all();

        // assertEquals is intentional: it compares associative arrays without regard
        // to key order, and the crawler emits 'url' first while it is listed last here.
        self::assertEquals([
            'name' => 'Gossiping',
            'nuser' => '8803',
            'class' => '綜合',
            'title' => '[八卦] 亞運李智凱、許皓鋐奪金!',
            // Newly added expectation: the board URL
            'url' => 'https://www.ptt.cc/bbs/Gossiping/index.html',
        ], $records[0]);

        // Exactly one request must have been sent, and it must target the hot-boards page.
        $httpClient->shouldHaveReceived('sendRequest')
            ->once()
            ->with(Mockery::on(
                static fn (Request $request): bool =>
                    (string) $request->getUri() === 'https://www.ptt.cc/bbs/hotboards.html'
            ));
    }
}
<?php
// src/PttCrawler.php
namespace Recca0120\Ithome30;
use GuzzleHttp\Psr7\Request;
use Psr\Http\Client\ClientInterface;
/**
 * Fetches the PTT hot-boards page and converts every board entry into an
 * associative array of its columns plus the absolute board URL.
 */
class PttCrawler
{
    /** Scheme and host prepended to the site-relative links found in the page. */
    private const BASE_URL = 'https://www.ptt.cc';

    /**
     * @param ClientInterface $httpClient PSR-18 client used to download the page
     */
    public function __construct(private ClientInterface $httpClient)
    {
    }

    /**
     * Downloads the hot-boards page and parses all board entries.
     *
     * @return list<array<string, string>> one entry per board: a 'url' key followed by
     *                                     one key per "board-*" column found in the row
     */
    public function all(): array
    {
        $request = new Request('GET', self::BASE_URL . '/bbs/hotboards.html');
        $response = $this->httpClient->sendRequest($request);

        return array_map(
            fn (string $row): array => $this->parseCols($row),
            $this->parseRows((string) $response->getBody()),
        );
    }

    /**
     * Extracts the link and the "board-*" columns from one board anchor element.
     *
     * @param string $row HTML of a single <a class="board">…</a> element
     *
     * @return array<string, string> 'url' first, then each column keyed by the suffix
     *                               of its "board-<suffix>" class
     */
    private function parseCols(string $row): array
    {
        // [^"]+ stops at the attribute's closing quote. The previous greedy ".+" ran
        // to the LAST quote on the line, which swallowed the following markup
        // whenever the whole row sits on a single line (e.g. minified HTML).
        preg_match('/href="(?<url>[^"]+)"/', $row, $matched);
        preg_match_all('/"board-(?<name>\w+)">(?<value>.+?)<\/div>/', $row, $matches);

        // Without an href the URL degrades to the bare base URL, as before, but
        // no longer raises an undefined-index warning.
        $cols = ['url' => self::BASE_URL . ($matched['url'] ?? '')];

        foreach ($matches['name'] ?? [] as $index => $name) {
            // Drop nested markup (e.g. the <span> around the user count) and the
            // "◎" marker that prefixes board titles.
            $cols[$name] = str_replace('◎', '', strip_tags($matches['value'][$index]));
        }

        return $cols;
    }

    /**
     * Splits the page into the HTML of each board anchor element.
     *
     * @param string $html full HTML of the hot-boards page
     *
     * @return list<string> every <a class="board" …>…</a> fragment, in document order
     */
    private function parseRows(string $html): array
    {
        // The "s" modifier lets "." span newlines, since one anchor covers several lines.
        preg_match_all('/<a\sclass="board"[^>]*>.+?<\/a>/s', $html, $matches);

        return $matches[0] ?? [];
    }
}
這樣是不是很輕鬆的就把欠缺的程式給補上了呢?
在接著往下一步之前,我們先停下來思考 PttCrawler 要怎麼用?以目前的程式碼接著寫下去,可能會發展成:
// Sketch of where the current design leads: the caller has to create two
// crawlers and wire the boards from one into the other.
$crawler = new PttCrawler();
$boardCrawler = new PttBoardCrawler();
foreach ($crawler->all() as $board) {
    foreach ($boardCrawler->get($board) as $articles) {
        // ...
    }
}
但這樣的程式會非常的『難用』!所以我們在這時候先重新規畫一下我們程式的用法
// Proposed usage: a single crawler object is the only entry point.
$crawler = new PttCrawler();
// Fetch every board together with its articles
$crawler->all();
// Fetch one specific board together with its articles
$crawler->get($boardName);
這樣的程式似乎比較容易使用,所以我們在下一篇會先花一些時間改寫目前的程式碼,讓接下來的開發更為順暢。